WO 01/40466 



PCT/US00/32678 



Table 1 (cont') 



35 



#include <stdio.h>
#include <ctype.h>

/*
 * Limits on the jmp bookkeeping used by nw() and readjmps().
 */
/* NOTE(review): the MAXJMP value did not survive extraction; 16 is assumed here -- confirm against a clean copy of the listing */
#define	MAXJMP		16	/* max jumps in a diag */
#define	MAXGAP		24	/* don't continue to penalize gaps larger than this */
#define	JMPS		1024	/* max jmps in an path */
#define	MX		4	/* save if there's at least MX-1 bases since last jmp */

/*
 * Scoring constants: D* are used when the sequences are DNA,
 * P* when they are protein (nw() picks DINS0/DINS1 or PINS0/PINS1 on dna).
 */
#define	DMAT		3	/* value of matching bases */
#define	DMIS		0	/* penalty for mismatched bases */
#define	DINS0		8	/* penalty for a gap */
#define	DINS1		1	/* penalty per base */
#define	PINS0		8	/* penalty for a gap */
#define	PINS1		4	/* penalty per residue */


struct jmp { 

short 

unsigned short 



nfMAXJMP]; 
x[MAXJMP]; 



struct diag { 



short 
struct jmp 



score; 
offset; 
ijmp; 



/* size of jmp (neg for dely) */ 
/* base no. of jmp in seq x */ 
/* limits seq to T~ 16-1 */ 



/* score at last jmp */ 
/* offset of prev block */ 
/* current jmp index */ 
/* list of jmps */ 



30 struct path { 



spc; /* number of leading spaces * 

nfJMPS];/* size of jmp (gap) */ 

x[JMPS];/* loc of jmp (last elem before gap) */ 



char 
char 
char 
char 



long 

struct 

struct 



*ofile; 

*namex[2]; 

*prog; 

*seqx[2]; 

dmax; 

dmaxO; 

dna; 

endgaps; 
gapx, gapy; 
lenO, lenl; 
ngapx, ngapy; 
smax; 
*xbm; 
offset; 
*dx; 
PP[2]; 



/* output file name */ 

/* seq names: getseqs() */ 

/* prog name for err msgs */ 

/* seqs: getseqs() */ 

/* best diag: nw() */ 

/* final diag */ 

/* set if dna: main() */ 

/* set if penalizing end gaps */ 

/* total gaps in seqs */ 

/* seq lens */ 

/* total size of gaps */ 

/* max score: nw() */ 

/* bitmap for matching */ 

/* current offset in jmp file */ 

/* holds diagonals */ 

/* holds path for seqs */ 



*calloc(), *malloc(), *index(), *strcpy(); 
*getseq(), *g_calloc(); 



60 



49 



WO 01/40466 



PCT7US00/32678 



Table 1 (conf) 

/* Needleman-Wunsch alignment program
 *
 * usage: progs file1 file2
 *  where file1 and file2 are two dna or two protein sequences.
 *  The sequences can be in upper- or lower-case and may contain ambiguity
 *  Any lines beginning with ';', '>' or '<' are ignored
 *  Max file length is 65535 (limited by unsigned short x in the jmp struct)
 *  A sequence with 1/3 or more of its elements ACGTU is assumed to be DNA
 *  Output is in the file "align.out"
 *
 * The program may create a tmp file in /tmp to hold info about traceback.
 * Original version developed under BSD 4.3 on a vax 8650
 */

^include "nw.h" 
^include "day.h" 

/*
 * Bit codes for DNA letters, indexed by (letter - 'A').
 * main() points xbm at this table when dna is set; nw() then treats two
 * bases as matching when their codes have a bit in common
 * (xbm[a]&xbm[b]), so a letter whose code is 0 matches nothing.
 */
static int	_dbval[26] = {
	1,14,2,13,0,0,4,11,0,0,12,0,3,15,0,0,0,5,6,8,8,7,9,0,10,0
};

/*
 * Bit codes for protein letters, indexed by (letter - 'A').
 * main() points xbm at this table when dna is not set.  Most letters
 * own the single bit 1<<(letter-'A'); 'B' also carries the 'D' and 'N'
 * bits, 'Z' also carries the 'E' and 'Q' bits, and entry 9 ('J') is
 * 0xFFFFFFF, which shares a bit with every other entry.
 */
static int	_pbval[26] = {
	1, 2|(1<<('D'-'A'))|(1<<('N'-'A')), 4, 8, 16, 32, 64,
	128, 256, 0xFFFFFFF, 1<<10, 1<<11, 1<<12, 1<<13, 1<<14,
	1<<15, 1<<16, 1<<17, 1<<18, 1<<19, 1<<20, 1<<21, 1<<22,
	1<<23, 1<<24, 1<<25|(1<<('E'-'A'))|(1<<('Q'-'A'))
};

main(ac, av) main 
int ac; 
char *av[]; 

{ 

prog = av[0]; 
if(ac != 3){ 

fprintf(stderr, "usage: %s filel file2\n", prog); 

fprintf(stderr, "where filel and file2 are two dna or two protein sequences.\n"); 
fprintf(stderr,"The sequences can be in upper- or lower-case\n"); 
fprintf(stderr,"Any lines beginning with ';' or ' < ' are ignored\n"); 
fprintf(stderr, "Output is in the file \"align.out\"\n"); 
exit(l); 

} 

namex[0] = av[l]; 

namex[l] = av[2]; 

seqx[0] = getseq(namex[0], &len0); 

seqxfl] = getseq(namex[l], &lenl); 

xbm = (dna)? .dbval : jibval; 

endgaps =0; /* 1 to penalize endgaps */ 

ofile = "align.out"; /* output file */ 

nw(); /* fill in the matrix, get the possible jmps */ 

readjmps(); /* get the actual jmps */ 

print(); /* print stats, alignment */ 

cleanup(O); /* unlink any tmp files */ 

} 



50 



WO 01/40466 



PCT7US00/32678 



Table 1 (conf) 

/* do the alignment, return best score: main() 

* dna: values in Fitch and Smith, PNAS, 80, 1382-1386, 1983 

* pro: PAM 250 values 

* When scores are equal, we prefer mismatches to any gap, prefer 



* a new gap to extending an ongoing gap, and prefer a gap in seqx 


* to a gap in seq y. 






*/ 






nw() 
{ 






char 


*px, *py; 


/* seqs and ptrs */ 


int 


*ndely, *dely; 


/* keep track of dely */ 


int 


ndelx, delx; 


/* keep track of delx */ 


int 




/* for swapping rowO, rowl */ 


int 


mis; 


/* score for each type */ 


int 


insO, insl; 


/* insertion penalties */ 


register 


id; 


/* diagonal index */ 


register 


u; 


/* jmp index */ 


register 


*col0, *coll; 


/* score for curr, last row */ 


register 


xx, yy; 


/* index into seqs */ 



dx = (struct diag *)g_calloc("to get diags", lenO+lenl + 1, sizeof(struct diag)); 

ndely = (int *)g_calloc("to get ndely", lenl + 1 , sizeof(int)); 
dely = (int *)g_calloc("to get dely", lenl + 1, sizeof(int)); 
colO = (int *)g_calloc("to get colO", lenl + 1, sizeof(int)); 
coll = (int *)g_calloc("to get coll", lenl + 1, sizeof(int)); 
insO = (dna)? DINSO : PINSO; 
insl = (dna)? DINS1 : PINS1; 

smax = -10000; 
if (endgaps) { 

for (col0[0] = dely[0] = -insO, yy = 1; yy < = lenl; yy+ + ) { 
col0[yy] = dely[yy] = colO[yy-l] - insl; 
ndely [yy] = yy; 

} 

col0[0] =0; /* Waterman Bull Math Biol 84 */ 

} 

else 

for (yy = 1; yy < = lenl; yy+ +) 
delyfyy] = -insO; 

/* fill in match matrix 
*/ 

for (px = seqx[0], xx = 1; xx < = lenO; px+ +, xx+ +) { 
/* initialize first entry in col 

*/ 

if (endgaps) { 

if (xx ==1) 

coll[0] = delx = -(insO+insl); 

else 

coll[0] = delx = colO[0] - insl; 
ndelx = xx; 

} 

else{ 

coll[0] = 0; 
delx = -insO; 
ndelx = 0; 

} 



51 



WO 01/40466 



PCT7US00/32678 



Table 1 (conf) 

for (py = seqx[l], yy = 1; yy < = lenl; py++, yy++) { 
mis = col0[yy-l]; 
if (dna) 

mis + = (xbm[*px-'A']&xbm[*py-'A'])? DMAT : DMIS; 

else 

mis += _day[*px-'A'][*py-'A']; 

/* update penalty for del in x seq; 

* favor new del over ongong del 

* ignore MAXGAP if weighting endgaps 
*/ 

if (endgaps | | ndelyfyy] < MAXGAP) { 

if (colOfyy] - insO > = dely[yy]) { 

delyfyy] = colOfyy] - (insO+insl); 
ndely[yy] = 1; 

} else { 

dely[yy] -= insl; 
ndelyfyy] + + ; 

} 

} else { 

if (col0[yy] - (insO+insl) > = delyfyy]) { 
dely[yy] = col0[yy] - (insO+insl); 
ndelyfyy] = 1; 

} else 

ndelyfyy] + +; 

} 

/* update penalty for del in y seq; 

* favor new del over ongong del 
*/ 

if (endgaps | j ndelx < MAXGAP) { 

if (coll[yy-l] - insO > = delx) { 

delx = colltyy-1] - (insO+insl); 
ndelx = 1; 

} else { 

delx - insl; 
ndelx++; 

} 

} else { 

if (coll[yy-l] - (insO+insl) > = delx) { 
delx = coll[yy-l] - (insO + insl); 
ndelx = 1; 

} else 

ndelx++; 

} 

/* pick the maximum score; we're favoring 

* mis over any del and delx over dely 



52 



WO 01/40466 



PCT7US00/32678 



Table 1 (conf) 

id = xx - yy + lenl - 1 ; 
if (mis > = delx && mis > = dely[yy]) 
coll[yy] = mis; 

5 else if (delx > = delyfyy]) { 

coll[yy] = delx; 
ij = dx[id].ijmp; 

if (dx[id].jp.n[0] && (!dna 1 1 (ndelx > = MAXJMP 
&&xx > dx[id].jp.x[ij]+MX) 1 1 mis > dx[id]. score +DINS0)) { 
10 dx[id].ijmp++; 

if (++ij >= MAXJMP) { 
writejmps(id); 
ij = dx[id).ijmp = 0; 
dx[id].offset = offset; 

15 offset += sizeof(struct jmp) + sizeof(offset); 

} 

} 

dx[id].jp.n[ij] = ndelx; 
dx[id].jp.x[ij] = xx; 

20 dxfid]. score = delx; 

} 

else { 

collfyy] = delyfyy]; 
ij = dx[id].ijmp; 

25 if (dx[id].jp.n[0] && (!dna | | (ndelyfyy] > = MAXJMP 

&& xx > dx[id].jp.x[ij]+MX) | | mis > dx[id]. score +DINS0)) { 
dx[id].ijmp+ + ; 
if(++ij > = MAXJMP) { 
writejmps(id); 

30 ij = dx[id].ijmp = 0; 

dx[id]. offset = offset; 

offset + = sizeof(struct jmp) + sizeof(offset); 

} 

} 

35 dxfid] Jp.nfij] = -ndely[yy]; 

dx[id].jp.x[ij] = xx; 
dx[id]. score = delyfyy]; 

} 

if (xx = = lenO && yy < lenl) { 
40 /* last col 

*/ 

if (endgaps) 

collfyy] -= ins0+insl*(lenl-yy); 
if (coll [yy] > smax) { 
45 smax = collfyy]; 

dmax = id; 

} 

} 

50 if (endgaps && xx < lenO) 

coll[yy-l] -= ins0+insl*(len0-xx); 
if (coll[yy-l] > smax) { 

smax = coll[yy-l]; 
dmax = id; 

55 } 

tmp = colO; colO = coll; coll = tmp; 

} 

(void) free((char *)ndely); 
(void) free((char *)dely); 
60 (void) free((char *)col0); 

(void) free((char *)coll); 



53 



WO 01/40466 



PCT7US00/32678 



Table 1 (conf) 



* print() — only routine visible outside this module 



* getmat() - trace back best path, count matches: print() 

* pr_align() - print alignment of described in array pfj: print() 

* dumpblockO -- dump a block of lines with numbers, stars: pr_align() 

* numsO — put out a number line: dumpblockO 

* putline() - put out a line (name, [num], seq, [num]): dumpblockO 

* stars() - -put a line of stars: dumpblockO 

* stripnameO - strip any path and prefix from a seqname 



55 



#define SPC 3 
#define P_LINE 256 
#define P SPC 3 



extern 
int 

FILE 



_day[26][26]; 
olen; 



/* maximum output line */ 

/* space between name or num and seq */ 



/* set output line length */ 
/* output file */ 



print() 
{ 



int lx, ly, firstgap, lastgap; /* overlap */ 

if ((fx = fopen(ofile, "w")) = = 0) { 

fprintf(stderr,"%s: can't write %s\n", prog, ofile); 
cleanup(l); 

} 

fprintf(fx, "< first sequence: %s (length = %d)\n", namex[0], lenO); 
fprintf(fx, "< second sequence: %s (length = %d)\n", namex[l], lenl); 
olen = 60; 
lx = lenO; 
ly = lenl; 

firstgap = lastgap = 0; 

if (dmax < lenl - 1) { /* leading gap in x */ 
pp[0].spc = firstgap = lenl - dmax -. 1; 
ly-= pp[0].spc; 

} 

else if (dmax > lenl - 1) { /* leading gap in y */ 
pp[l].spc = firstgap = dmax - (lenl - 1); 
lx-= pp[l].spc; 

} 

if (dmaxO < lenO - 1) { /* trailing gap in x */ 
lastgap = lenO - dmaxO -1; 
lx -= lastgap; 

} 

else if (dmaxO > lenO - 1) { /* trailing gap in y */ 
lastgap = dmaxO - (lenO - 1); 
ly -= lastgap; 

} 

getmat(lx, ly, firstgap, lastgap); 
pr_align(); 



print 



60 



54 



WO 01/40466 



PCT7US00/32678 



Table 1 fcont') 

/* 

* trace back the best path, count matches 

*/ 

static 

getmat(lx, ly, firstgap, lastgap) getmat 
int lx, ly; /* "core" (minus endgaps) */ 

int firstgap, lastgap; /* leading trailing overlap */ 

{ 

int nm, iO, il, sizO, sizl; 

char outx[32]; 
double pet; 
register nO, nl; 

register char *p0, *pl; 

/* get total matches, score 
*/ 

iO = il = sizO = sizl = 0; 
pO = seqx[0] + pp[l].spc; 
pi = seqx[l] + pp[0].spc; 
nO = pp[l].spc + 1; 
nl = pp[0].spc + 1; 

nm = 0; 

while (*p0 &&*pl){ 
if(sizO) { 

pl + +; 



30 else if (sizl) { 

p0++; 



} 

else{ 



n0+ + 
sizl--; 



if (xbm[*pO-' A , ]&xbm[*pl- , A , ;|) 

nm++; 
if (n0++ == pp[0].x[iO]) 

sizO = pp[0].n[iO++]; 
if(nl + + ==pp[l].x[il]) 

sizl = pp[l].n[il + +]; 

p0++; 
pl + +; 



/* pet homology: 

* if penalizing endgaps, base is the shorter seq 

* else, knock off overhangs and take shorter con 



lx = (lenO < lenl)? lenO : lenl; 

else 

lx = (lx < ly)? lx : ly; 
pet = 100.*(double)nm/(double)lx; 

fprintf(fx, "\n"); 

fprintf(fx, " < %d match%s in an overlap of %d: % .2f percent similarity\n", 
nm, (nm = = 1)? "" : "es", lx, pet); 



55 



WO 01/40466 



PCT7US00/32678 



Table 1 (conf) 

fprintf(fx, " < gaps in first sequence: %d", gapx); . . .getmat 

if (gapx) { 

(void) sprintf(outx, " (%d %s%s)", 
5 ngapx, (dna)? "base": "residue", (ngapx == 1)? "":"s"); 

rprintf(fx,"%s", outx); 

fprintftfx, ", gaps in second sequence: %d", gapy); 
if(gapy){ 

10 (void) sprintf(outx, " (%d %s%s)", 

ngapy, (dna)? "base":"residue", (ngapy == 1)? "":"s"); 
fprintf(fx,"%s", outx); 

} 

if (dna) 

15 rprintf(fx, 

"\n<score: %d (match = %d, mismatch = %d, gap penalty = %d + %d per base)\n", 
smax, DMAT, DMIS, DINSO, DINS1); 

else 

fprintf(fx, 

20 "\n<score: %d (Dayhoff PAM 250 matrix, gap penalty = %d + %d per residue)\n", 

smax, PINS0, P1NS1); 
if (endgaps) 

fprintf(fx, 

"< endgaps penalized, left endgap: %d %s%s, right endgap: %d %s%s\n", 
25 firstgap, (dna)? "base" : "residue", (firstgap == 1)? "" : "s", 

lastgap, (dna)? "base" : "residue", (lastgap == 1)? "" : "s"); 



else 



fprintf(fx, "< endgaps not penalizedW); 



static nm; /* matches in core ~ for checking */ 

static lmax; /* lengths of stripped file names */ 

static ij[2]; /* jmp index for a path */ 

static nc[2]; /* number at start of current line */ 

35 static ni[2]; /* current elem number - for gapping */ 

static siz[2]; 

static char *ps[2]; /* ptr to current element */ 

static char *po[2]; /* ptr to next output char slot */ 

static char out[2][P_LINE]; /* output line */ 

40 static char star[P_LINE]; /* set by stars() */ 

/* 

* print alignment of described in struct path pp[] 

*/ 

45 static 

praiigno pr_align 

{ 

int nn; /* char count */ 

int more; 

50 register i; 

for (i = 0, lmax = 0; i < 2; i+ +) { 
nn = stripname(namex[i]); 
if (nn > lmax) 
55 lmax = nn; 

nc[i] = 1; 
ni[i] = 1; 
siz[i] = ij[i] = 0; 
60 ps[i] = seqxfi]; 

po[i] = out[i]; } 



56 



WO 01/40466 



PCT7US00/32678 



35 



Table 1 (conf) 

for (nn = ran = 0, more = 1 ; more; ) { ...pr_align 

for (i = more = 0; i < 2; i+ +) { 

/* 

* do we have more of this sequence? 

*/ 

if(!*ps[i]) 



more++; 

if (pp[i].spc) { /* leading space */ 
*poW + + = ' '; 
pp[i].spc~; 

} 

else if (siz[i]) { /* in a gap */ 
*po[i] + + = 
siz[i]— ; 



} 



} 



e { /* we're putting a seq element 

*/ 

*po[i] = *ps[i]; 
if (islower(*ps[i]» 

*ps[i] = toupper(*ps[i]); 

po[i] + +; 
ps[i] + + ; 



* are we at next gap for this seq? 
*/ 

if(ni[i] ==pp[i].x[ij[i]]){ 

/* 

* we need to merge all gaps 

* at this location 
*/ 

siz[i] = pp[i].n[ij[i]++]; 
while (ni[i] ==pp[i].x[ij[i]]) 

siz[i] += pp[i].n[ij[i]++]; 

} 

ni[i] + + ; 



} 

if (+ +nn = = olen | | !more && nn) { 
dumpblock(); 
for (i = 0; i < 2; i++) 
po[i] = out[i]; 

nn = 0; 



* dump a block of lines, including numbers, stars: pr_align() 



static 

dumpbiocko dumpblock 



for (i = 0; i < 2; i++) 
*po[i]_ = '\0'; 



57 



WO 01/40466 



PCT7US00/32678 



Table 1 (conV) 



(void) putc('\n', fx); 
for(i = 0; i < 2; i++) { 

if (*out[i] &&(*out[i] ! 
if(i==0) 



' || *(po[i])!= 1 ')){ 



nums(i)'; 
if (i = = 0&& *out[l]) 
stars(); 

putline(i); 

if (i == 0 && *out[l]) 
fprintf(fx, star); 

if(i== 1) 

nums(i); 



..dumpblock 



* put out a number line: dumpblock() 



static 

nums(ix) 



char 
register 
register char 



/* index in out[] holding seq line */ 
nline[P_LINE]; 
*pn, *px, *py; 
0; i < lmax+P_SPC; i++, pn++) 



for (pn = nline, i = 
*pn = ' '; 

for (i = nc[ix], py = out[ix]; *py; py+ +, pn+ +) { 

if(*py == ' ' || *py== '-') 



if (i%10 = = 0 |[ (i== l&&nc[ix]! = 
j = (i < 0)? -i : i; 
for (px = pn; j; j /= 10, px~) 
*px = j%10 + '0'; 

if (i < 0) 



} 

else 



*px = 



} 

*pn = '\0'; 
nc[ix] = i; 

for (pn = nline; *pn; pn+ +) 
(void) putc(*pn, fx); 
(void) putc('\n', fx); 



* put out a line (name, [num], seq, [num]): dumpblock() 

*/ 

static 

putline(ix) 



putline 



58 



Table 1 (conf) 



register char *px; 

for (px = namexfix], i = 0; *px && *px != ':'; px+ + , i++) 

(void) putc(*px, fx); 
for (; i < lmax+P_SPC; i++) 

(void) putcO ', fx); 

/* these count from 1 : 

* ni[] is current element (from 1) 

* nc[] is number at start of current line 

*/ 

for (px = outfix]; *px; px+ +) 

(void) putc(*px&0x7F, fx); 
(void) putc('\n', fx); 



* put a line of stars (seqs always in out[0], out[l]): dumpblock() 

*/ 

static 

stars() 
{ 



30 if (!*out[0] | | (*out[0] = ='•&& *(po[0]) = = ' ') 1 1 

!*out[l] | | <*out[l] = = " && *(po[!]) = ="')) 
return; 
px = star; 

for (i = lmax + P_SPC; i; i--) 
35 *px++ = ' '; 

for (pO = out[0J, pi = out[l]; *p0 && *pl; p0+ + , pl + +) { 
if (isalpha(*pO) && isalpha(*pl)) { 

40 if(xbm[*pO-'A']&xbm[*pl-'A , l){ 

nm+ +; 

} 

else if (!dna && day[*pO-'A'][*pl-'A'] > 0) 
45 cx = V; 



PCT7US00/32678 



...putline 



50 



55 } 



} 

else 

cx = ' 
*px++ = cx; 

} 

*px++ = '\n'; 
*px = '\0'; 



59 



WO 01/40466 



PCT7US00/32678 



25 



40 



Table 1 (conf) 

/*
 * strip path or prefix from pn, return len: pr_align()
 *
 * pn is edited in place: everything up to and including the last '/'
 * is removed.  A name without '/' is left untouched.  The return value
 * is the length of what remains in pn.
 */
static int
stripname(pn)
	char	*pn;	/* file name (may be path) */
{
	register char	*px, *py;

	py = 0;
	for (px = pn; *px; px++)
		if (*px == '/')
			py = px + 1;
	if (py) {
		/*
		 * py points inside pn, so source and destination overlap;
		 * strcpy() is undefined for that, copy forward by hand
		 */
		for (px = pn; (*px = *py) != '\0'; px++, py++)
			;
	}
	return(strlen(pn));
}



60 



60 



WO 01/40466 



PCT7US00/32678 



Table 1 (conf) 



* cleanupO — cleanup any tmp file 

* getseq() - read in seq, set dna, len, maxlen 

* g_calloc() — calloc() with error checkin 

* readjmpsO -- get the good jmps, from tmp file if necessary 

* writejmpsO -- write a filled array of jmps to a tmp file: nw() 



# include "nw.h" 
^include <sys/file.h> 



char 
FILE 



*jname = 7tmp/homgXXXXXX"; 



cleanupO; 
lseekQ; 



/* tmp file for jmps */ 
/* cleanup tmp file */ 



20 



* remove any tmp file if we blow 
*/ 

cleanup© 



cleanup 



(void) unlink(jname); 



* read, return ptr to seq, set dna, len, maxlen 

* skip lines starting with ';','<', or ' > ' 

* seq in upper or lower case 



getseq(file, len) 

char *file; 
int *len; 



{ 



register char 
int 

FILE 



/* file name */ 
/* seq len */ 

line[1024], *pseq; 
*px, *py; 
natgc, tlen; 



getseq 



if ((fp = fopen(file,"r")) = = 0) { 

fprintf(stderr,"%s: can't read %s\n", prog, file); 
exit(l); 

} 

tlen = natgc = 0; 

while (fgets(line, 1024, fp)) { 

if (*line == ';' 1 1 *line = = '<' 1 1 *line == '>') 

continue; 
for (px = line; *px != '\n'; px++) 

if (isupper(*px) 1 1 islower(*px)) 
tlen++; 

} 

if ((pseq = malloc((unsigned)(tlen+6))) == 0) { 

fprintf(stderr," %s: malloc() failed to get %d bytes for %s\n", prog, tlen+6, file); 
exit(l); 

} 

pseq[0] = pseqfl] = pseq[2] = pseq[3] = '\0'; 



61 



WO 01/40466 



PCT7US00/32678 



Table 1 (conf) 

py = pseq + 4; 
*len = tlen; 
rewind(fp); 

while (fgets(line, 1024, fp)) { 

if (*line == ';' 1 1 *line == '<' || *Iine == '>') 



...getseq 



for (px = line; *px != '\n'; px++) { 
if (isupper(*px)) 

*py++ = *px; 
else if (islower(*px)) 

*py++ = toupper(*px); 
if (index("ATGCU",*(py-l))) 



} 

} 

*py++ = '\0'\ 

*py = '\0'; 
20 (void) fclose(fp); 



} 



dna = natgc > (tlen/3); 
return(pseq+4); 



25 char * 

g_calloc(msg, nx, sz) g_Call0C 
char *msg; /* program, calling routine */ 

nx, sz; /* number and size of elements */ 



{ 



char *px, *calloc(); 

if ((px = calloc((unsigned)nx, (unsigned)sz)) = = 0) { 
if(*msg){ 

fprintf(stderr, "%s: g_calloc() failed %s (n=%d, sz = %d)\n", prog, msg, nx, sz); 
exit(l); 

} 

} 

return(px); 



* get final jmps from dx[] or tmp file, set pp[], reset dmax: main() 
*/ 

readjmpso readjmps 
45 { 

int fd = -1; 

int siz, iO, il; 

register i, j, xx; 

50 if(fj){ 

(void) fclose(fj); 

if ((fd = open(jname, 0_RDONLY, 0)) < 0) { 

fprintf(stderr, "%s: can't open() %s\n", prog, jname); 
cleanup(l); 

55 } 
} 

for (i = iO = il = 0, dmaxO = dmax, xx = lenO; ; i++) { 
while (1) { 

for (j = dx[dmax].ijmp; j > = 0 && dx[dmax].jp.x[j] > = xx; j-) 



62 



WO 01/40466 



PCT7US00/32678 



Table 1 (conf) 

...readjmps 

if (j < 0 && dx[dmax] .offset && fj) { 

(void) lseek(fd, dx[dmax]. offset, 0); 
(void) read(fd, (char *)&dx[dmax].jp, sizeof(struct jmp)); 
5 (void) read(fd, (char *)&dx[dmax] . offset, sizeof(dx[dmax].offset)); 

dx[dmax].ijmp = MAXJMP-1; 

} 

else 

break; 

10 } 

if(i > = JMPS){ 

fprintf(stderr, "%s: too many gaps in alignment^", prog); 
cleanup(l); 

15 if a > = 0) { 

siz = dx[dmax].jp.n[j]; 
xx = dx[dmax].jp.x[j]; 
dmax + = siz; 

if (siz < 0) { /* gap in second seq */ 

20 pp[l].n[il] = -siz; 

xx + = siz; 

/* id = xx - yy + lenl - 1 

*/ 

pp[l].x[il] = xx - dmax + lenl - 1; 
25 gapy++; 

ngapy -= siz; 
/* ignore MAXGAP when doing endgaps */ 

siz = (-siz < MAXGAP | | endgaps)? -siz : MAXGAP; 
il + + ; 

30 } 

else if (siz > 0) { /* gap in first seq */ 
pp[0].n[i0] = siz; 
pp[0].x[i0] = xx; 
gapx+ + ; 

35 ngapx + = siz; 

/* ignore MAXGAP when doing endgaps */ 

siz = (siz < MAXGAP | | endgaps)? siz : MAXGAP; 
i0+ + ; 

40 } } 

else 

break; 

} 

45 /* reverse the order of jmps 

*/ 

for (j = 0, i0~; j < iO; j + + , i0~) { 

i = pp[0].n[j]; pp[0].n[j] = pp[0].n[i0]; pp[0].n[i0] = i; 
i = pp[0].x[j]; pp[0].x[j] = pp[0].x[i0]; pp[0].x[i0] = i; 

50 } 

for (j = 0, il — ; j < il; j++, il~) { 

i = PP[l]-n[j]; pp[l].n(j] = pp[l].n[il]; pp[l].n[il] = i; 
i = pp[l].x[j]; pp[l].x(j] = pp[l].x[il]; pp[l].x[il] = i; 

} 

55 if (fd > = 0) 

(void) close(fd); 

if(fj){ 

(void) unlink(jname); 
fj =0; 

60 offset = 0; 

} } 



63 



WO 01/40466 



PCT7US00/32678 



Table 1 (conV) 

/* 

* write a filled jmp struct offset of the prev one (if any): nw() 

*/ 

writejmps(ix) writejmps 

int ix; 

{ 

char *mktemp(); 
if(!fj){ 

if (mktemp(jname) < 0) { 

fprintf(stderr, "%s: can't mktempO %s\n", prog, j name); 
cleanup(l); 

} 

if ((fj = fopen(jname, "w")) = = 0) { 

fprintf(stderr, "%s: can't write %s\n", prog, jname); 
exit(l); 

} 

} 

(void) fwrite((char *)&dx[ix].jp, sizeof(struct jmp), 1, fj); 
(void) fwrite((char *)&dx[ix]. offset, sizeof(dx[ix]. offset), 1, fj); 



35 



64 



WO 01/40466 



PCT7US00/32678 



Table 2 

PRO                   XXXXXXXXXXXXXXX    (Length = 15 amino acids) 

Comparison Protein    XXXXXYYYYYYY       (Length = 12 amino acids) 

% amino acid sequence identity = 

(the number of identically matching amino acid residues between the two polypeptide sequences as determined 
by ALIGN-2) divided by (the total number of amino acid residues of the PRO polypeptide) = 

5 divided by 15 = 33.3% 

Table 3 

PRO                   XXXXXXXXXX         (Length = 10 amino acids) 

Comparison Protein    XXXXXYYYYYYZZYZ    (Length = 15 amino acids) 

% amino acid sequence identity = 

(the number of identically matching amino acid residues between the two polypeptide sequences as determined 
by ALIGN-2) divided by (the total number of amino acid residues of the PRO polypeptide) = 

5 divided by 10 = 50% 



65 



WO 01/40466 



PCT7US00/32678 



Table 4 

PRO-DNA           NNNNNNNNNNNNNN      (Length = 14 nucleotides) 

Comparison DNA    NNNNNNLLLLLLLLLL    (Length = 16 nucleotides) 

% nucleic acid sequence identity = 

(the number of identically matching nucleotides between the two nucleic acid sequences as determined by 
ALIGN-2) divided by (the total number of nucleotides of the PRO-DNA nucleic acid sequence) = 

6 divided by 14 = 42.9% 

Table 5 

PRO-DNA           NNNNNNNNNNNN        (Length = 12 nucleotides) 

Comparison DNA    NNNNLLLVV           (Length = 9 nucleotides) 

% nucleic acid sequence identity = 

(the number of identically matching nucleotides between the two nucleic acid sequences as determined by 
ALIGN-2) divided by (the total number of nucleotides of the PRO-DNA nucleic acid sequence) = 

4 divided by 12 = 33.3% 



66 



WO 01/40466 



PCT7US00/32678 



II. Compositions and Methods of the Invention 

A. Full-Length PRO Polypeptides 

The present invention provides newly identified and isolated nucleotide sequences encoding polypeptides 
referred to in the present application as PRO polypeptides. In particular, cDNAs encoding various PRO 
polypeptides have been identified and isolated, as disclosed in further detail in the Examples below. It is noted 
that proteins produced in separate expression rounds may be given different PRO numbers but the UNQ number 
is unique for any given DNA and the encoded protein, and will not be changed. However, for sake of 
simplicity, in the present specification the protein encoded by the full length native nucleic acid molecules 
disclosed herein as well as all further native homologues and variants included in the foregoing definition of 
PRO, will be referred to as "PRO/number", regardless of their origin or mode of preparation. 

As disclosed in the Examples below, various cDNA clones have been deposited with the ATCC. The 
actual nucleotide sequences of those clones can readily be determined by the skilled artisan by sequencing of the 
deposited clone using routine methods in the art. The predicted amino acid sequence can be determined from 
the nucleotide sequence using routine skill. For the PRO polypeptides and encoding nucleic acids described 
herein, Applicants have identified what is believed to be the reading frame best identifiable with the sequence 
information available at the time. 

B. PRO Polypeptide Variants 

In addition to the full-length native sequence PRO polypeptides described herein, it is contemplated that 
PRO variants can be prepared. PRO variants can be prepared by introducing appropriate nucleotide changes into 

20 the PRO DNA, and/or by synthesis of the desired PRO polypeptide. Those skilled in the art will appreciate that 
amino acid changes may alter post-translational processes of the PRO, such as changing the number or position 
of glycosylation sites or altering the membrane anchoring characteristics. 

Variations in the native full-length sequence PRO or in various domains of the PRO described herein, 
can be made, for example, using any of the techniques and guidelines for conservative and non-conservative 

25 mutations set forth, for instance, in U.S. Patent No. 5,364,934. Variations may be a substitution, deletion or 
insertion of one or more codons encoding the PRO that results in a change in the amino acid sequence of the 
PRO as compared with the native sequence PRO. Optionally the variation is by substitution of at least one amino 
acid with any other amino acid in one or more of the domains of the PRO. Guidance in determining which 
amino acid residue may be inserted, substituted or deleted without adversely affecting the desired activity may 

30 be found by comparing the sequence of the PRO with that of homologous known protein molecules and 
minimizing the number of amino acid sequence changes made in regions of high homology. Amino acid 
substitutions can be the result of replacing one amino acid with another amino acid having similar structural 
and/or chemical properties, such as the replacement of a leucine with a serine, i.e., conservative amino acid 
replacements. Insertions or deletions may optionally be in the range of about 1 to 5 amino acids. The variation 

35 allowed may be determined by systematically making insertions, deletions or substitutions of amino acids in the 
sequence and testing the resulting variants for activity exhibited by the full-length or mature native sequence. 



67 



WO 01/40466 



PCT7US00/32678 



PRO polypeptide fragments are provided herein. Such fragments may be truncated at the N-terminus 
or C-terminus, or may lack internal residues, for example, when compared with a full length native protein. 
Certain fragments lack amino acid residues that are not essential for a desired biological activity of the PRO 
polypeptide. 

PRO fragments may be prepared by any of a number of conventional techniques. Desired peptide 
5 fragments may be chemically synthesized. An alternative approach involves generating PRO fragments by 
enzymatic digestion, e.g., by treating the protein with an enzyme known to cleave proteins at sites defined by 
particular amino acid residues, or by digesting the DNA with suitable restriction enzymes and isolating the 
desired fragment. Yet another suitable technique involves isolating and amplifying a DNA fragment encoding 
a desired polypeptide fragment, by polymerase chain reaction (PCR). Oligonucleotides that define the desired 
termini of the DNA fragment are employed at the 5' and 3' primers in the PCR. Preferably, PRO polypeptide 
fragments share at least one biological and/or immunological activity with the native PRO polypeptide disclosed 
herein. 

In particular embodiments, conservative substitutions of interest are shown in Table 6 under the heading 
of preferred substitutions. If such substitutions result in a change in biological activity, then more substantial 
15 changes, denominated exemplary substitutions in Table 6, or as further described below in reference to amino 
acid classes, are introduced and the products screened. 

Table 6 

Original      Exemplary                                  Preferred 
Residue       Substitutions                              Substitutions 

Ala (A)       val; leu; ile                              val 
Arg (R)       lys; gln; asn                              lys 
Asn (N)       gln; his; lys; arg                         gln 
Asp (D)       glu                                        glu 
Cys (C)       ser                                        ser 
Gln (Q)       asn                                        asn 
Glu (E)       asp                                        asp 
Gly (G)       pro; ala                                   ala 
His (H)       asn; gln; lys; arg                         arg 
Ile (I)       leu; val; met; ala; phe; norleucine        leu 
Leu (L)       norleucine; ile; val; met; ala; phe        ile 
Lys (K)       arg; gln; asn                              arg 
Met (M)       leu; phe; ile                              leu 
Phe (F)       leu; val; ile; ala; tyr                    leu 
Pro (P)       ala                                        ala 
Ser (S)       thr                                        thr 
Thr (T)       ser                                        ser 
Trp (W)       tyr; phe                                   tyr 
Tyr (Y)       trp; phe; thr; ser                         phe 
Val (V)       ile; leu; met; phe; ala; norleucine        leu 



68 



WO 01/40466 



PCT7US00/32678 



Substantial modifications in function or immunological identity of the PRO polypeptide are accomplished 
by selecting substitutions that differ significantly in their effect on maintaining (a) the structure of the polypeptide 
backbone in the area of the substitution, for example, as a sheet or helical conformation, (b) the charge or 
hydrophobicity of the molecule at the target site, or (c) the bulk of the side chain. Naturally occurring residues 
are divided into groups based on common side-chain properties: 
(1) hydrophobic: norleucine, met, ala, val, leu, ile; 

(2) neutral hydrophilic: cys, ser, thr; 

(3) acidic: asp, glu; 

(4) basic: asn, gln, his, lys, arg; 

(5) residues that influence chain orientation: gly, pro; and 
(6) aromatic: trp, tyr, phe. 

Non-conservative substitutions will entail exchanging a member of one of these classes for another class. 
Such substituted residues also may be introduced into the conservative substitution sites or, more preferably, into 
the remaining (non-conserved) sites. 

The variations can be made using methods known in the art such as oligonucleotide-mediated (site- 
directed) mutagenesis, alanine scanning, and PCR mutagenesis. Site-directed mutagenesis [Carter et al., Nucl. 
Acids Res., 13:4331 (1986); Zoller et al., Nucl. Acids Res., 10:6487 (1987)], cassette mutagenesis [Wells et 
al., Gene, 34:315 (1985)], restriction selection mutagenesis [Wells et al., Philos. Trans. R. Soc. London Ser A, 
317:415 (1986)] or other known techniques can be performed on the cloned DNA to produce the PRO variant 
DNA. 

Scanning amino acid analysis can also be employed to identify one or more amino acids along a 
contiguous sequence. Among the preferred scanning amino acids are relatively small, neutral amino acids. Such 
amino acids include alanine, glycine, serine, and cysteine. Alanine is typically a preferred scanning amino acid 
among this group because it eliminates the side-chain beyond the beta-carbon and is less likely to alter the main- 
chain conformation of the variant [Cunningham and Wells, Science, 244:1081-1085 (1989)]. Alanine is also 
typically preferred because it is the most common amino acid. Further, it is frequently found in both buried and 
exposed positions [Creighton, The Proteins, (W.H. Freeman & Co., N.Y.); Chothia, J. Mol. Biol., 150:1 
(1976)]. If alanine substitution does not yield adequate amounts of variant, an isoteric amino acid can be used. 

C. Modifications of PRO 

30 Covalent modifications of PRO are included within the scope of this invention. One type of covalent 

modification includes reacting targeted amino acid residues of a PRO polypeptide with an organic derivatizing 
agent that is capable of reacting with selected side chains or the N- or C- terminal residues of the PRO. 
Derivatization with bifunctional agents is useful, for instance, for crosslinking PRO to a water-insoluble support 
matrix or surface for use in the method for purifying anti-PRO antibodies, and vice-versa. Commonly used 

crosslinking agents include, e.g., 1,1-bis(diazoacetyl)-2-phenylethane, glutaraldehyde, N-hydroxysuccinimide 
esters, for example, esters with 4-azidosalicylic acid, homobifunctional imidoesters, including disuccinimidyl 
esters such as 3,3'-dithiobis(succinimidylpropionate), bifunctional maleimides such as bis-N-maleimido-1,8- 



WO 01/40466 



PCT7US00/32678 



octane and agents such as methyl-3-[(p-azidophenyl)dithio]propioimidate. 

Other modifications include deamidation of glutaminyl and asparaginyl residues to the corresponding 
glutamyl and aspartyl residues, respectively, hydroxylation of proline and lysine, phosphorylation of hydroxyl 
groups of seryl or threonyl residues, methylation of the α-amino groups of lysine, arginine, and histidine side 
chains [T.E. Creighton, Proteins: Structure and Molecular Properties . W.H. Freeman & Co., San Francisco, 
5 pp. 79-86 (1983)], acetylation of the N-terminal amine, and amidation of any C-terminal carboxyl group. 

Another type of covalent modification of the PRO polypeptide included within the scope of this 
invention comprises altering the native glycosylation pattern of the polypeptide. "Altering the native 
glycosylation pattern" is intended for purposes herein to mean deleting one or more carbohydrate moieties found 
in native sequence PRO (either by removing the underlying glycosylation site or by deleting the glycosylation 
10 by chemical and/or enzymatic means), and/or adding one or more glycosylation sites that are not present in the 
native sequence PRO. In addition, the phrase includes qualitative changes in the glycosylation of the native 
proteins, involving a change in the nature and proportions of the various carbohydrate moieties present. 

Addition of glycosylation sites to the PRO polypeptide may be accomplished by altering the amino acid 
sequence. The alteration may be made, for example, by the addition of, or substitution by, one or more serine 
15 or threonine residues to the native sequence PRO (for O-linked glycosylation sites). The PRO amino acid 
sequence may optionally be altered through changes at the DNA level, particularly by mutating the DNA 
encoding the PRO polypeptide at preselected bases such that codons are generated that will translate into the 
desired amino acids. 

Another means of increasing the number of carbohydrate moieties on the PRO polypeptide is by 
20 chemical or enzymatic coupling of glycosides to the polypeptide. Such methods are described in the art, e.g., 
in WO 87/05330 published 11 September 1987, and in Aplin and Wriston, CRC Crit. Rev. Biochem. . pp. 259- 
306 (1981). 

Removal of carbohydrate moieties present on the PRO polypeptide may be accomplished chemically 
or enzymatically or by mutational substitution of codons encoding for amino acid residues that serve as targets 
25 for glycosylation. Chemical deglycosylation techniques are known in the art and described, for instance, by 
Hakimuddin, et al., Arch. Biochem. Biophys., 259:52 (1987) and by Edge et al., Anal. Biochem., 118:131 
(1981). Enzymatic cleavage of carbohydrate moieties on polypeptides can be achieved by the use of a variety 
of endo- and exo-glycosidases as described by Thotakura et al., Meth. Enzymol., 138:350 (1987). 

Another type of covalent modification of PRO comprises linking the PRO polypeptide to one of a variety 
30 of nonproteinaceous polymers, e.g. , polyethylene glycol (PEG), polypropylene glycol, or polyoxyalkylenes, in 
the manner set forth in U.S. Patent Nos. 4,640,835; 4,496,689; 4,301,144; 4,670,417; 4,791,192 or 4,179,337. 

The PRO of the present invention may also be modified in a way to form a chimeric molecule 
comprising PRO fused to another, heterologous polypeptide or amino acid sequence. 

In one embodiment, such a chimeric molecule comprises a fusion of the PRO with a tag polypeptide 
35 which provides an epitope to which an anti-tag antibody can selectively bind. The epitope tag is generally placed 
at the amino- or carboxyl- terminus of the PRO. The presence of such epitope-tagged forms of the PRO can be 
detected using an antibody against the tag polypeptide. Also, provision of the epitope tag enables the PRO to 



70 



WO 01/40466 



PCT7US00/32678 



be readily purified by affinity purification using an anti-tag antibody or another type of affinity matrix that binds 
to the epitope tag. Various tag polypeptides and their respective antibodies are well known in the art. Examples 
include poly-histidine (poly-his) or poly-histidine-glycine (poly-his-gly) tags; the flu HA tag polypeptide and its 
antibody 12CA5 [Field et al., Mol. Cell. Biol. . 8:2159-2165 (1988)]; the c-myc tag and the 8F9, 3C7, 6E10, 
G4, B7 and 9E10 antibodies thereto [Evan et al., Molecular and Cellular Biology . 5:3610-3616 (1985)]; and the 
5 Herpes Simplex virus glycoprotein D (gD) tag and its antibody [Paborsky et al. , Protein Engineering . 3(6):547- 
553 (1990)]. Other tag polypeptides include the Flag-peptide [Hopp et al., BioTechnology . 6:1204-1210 
(1988)]; the KT3 epitope peptide [Martin et al., Science, 255:192-194 (1992)]; an α-tubulin epitope peptide 
[Skinner et al., J. Biol. Chem. . 266:15163-15166 (1991)]; and the T7 gene 10 protein peptide tag [Lutz- 
Freyermuth et al., Proc. Natl. Acad. Sci. USA . 87:6393-6397 (1990)]. 

10 In an alternative embodiment, the chimeric molecule may comprise a fusion of the PRO with an 

immunoglobulin or a particular region of an immunoglobulin. For a bivalent form of the chimeric molecule (also 
referred to as an "immunoadhesin"), such a fusion could be to the Fc region of an IgG molecule. The Ig fusions 
preferably include the substitution of a soluble (transmembrane domain deleted or inactivated) form of a PRO 
polypeptide in place of at least one variable region within an Ig molecule. In a particularly preferred 

15 embodiment, the immunoglobulin fusion includes the hinge, CH2 and CH3, or the hinge, CH1, CH2 and CH3 
regions of an IgG1 molecule. For the production of immunoglobulin fusions see also US Patent No. 5,428,130 
issued June 27, 1995. 

D. Preparation of PRO 

20 The description below relates primarily to production of PRO by culturing cells transformed or 

transfected with a vector containing PRO nucleic acid. It is, of course, contemplated that alternative methods, 
which are well known in the art, may be employed to prepare PRO. For instance, the PRO sequence, or 
portions thereof, may be produced by direct peptide synthesis using solid-phase techniques [see, e.g., Stewart 
et al., Solid-Phase Peptide Synthesis, W.H. Freeman Co., San Francisco, CA (1969); Merrifield, J. Am. Chem. 

25 Soc . 85:2149-2154 (1963)]. In vitro protein synthesis may be performed using manual techniques or by 
automation. Automated synthesis may be accomplished, for instance, using an Applied Biosystems Peptide 
Synthesizer (Foster City, CA) using manufacturer's instructions. Various portions of the PRO may be 
chemically synthesized separately and combined using chemical or enzymatic methods to produce the full-length 
PRO. 

30 

1. Isolation of DNA Encoding PRO 
DNA encoding PRO may be obtained from a cDNA library prepared from tissue believed to possess 
the PRO mRNA and to express it at a detectable level. Accordingly, human PRO DNA can be conveniently 
obtained from a cDNA library prepared from human tissue, such as described in the Examples. The PRO- 
35 encoding gene may also be obtained from a genomic library or by known synthetic procedures (e.g. , automated 
nucleic acid synthesis). 



71 



WO 01/40466 



PCT7US00/32678 



Libraries can be screened with probes (such as antibodies to the PRO or oligonucleotides of at least 
about 20-80 bases) designed to identify the gene of interest or the protein encoded by it. Screening the cDNA 
or genomic library with the selected probe may be conducted using standard procedures, such as described in 
Sambrook et al. , Molecular Cloning: A Laboratory Manual (New York: Cold Spring Harbor Laboratory Press, 
1989). An alternative means to isolate the gene encoding PRO is to use PCR methodology [Sambrook et al., 
5 supra : Dieffenbach et al., PCR Primer: A Laboratory Manual (Cold Spring Harbor Laboratory Press, 1995)]. 

The Examples below describe techniques for screening a cDNA library. The oligonucleotide sequences 
selected as probes should be of sufficient length and sufficiently unambiguous that false positives are minimized. 
The oligonucleotide is preferably labeled such that it can be detected upon hybridization to DNA in the library 
being screened. Methods of labeling are well known in the art, and include the use of radiolabels like ³²P-labeled 
10 ATP, biotinylation or enzyme labeling. Hybridization conditions, including moderate stringency and high 
stringency, are provided in Sambrook et al., supra . 

Sequences identified in such library screening methods can be compared and aligned to other known 
sequences deposited and available in public databases such as GenBank or other private sequence databases. 
Sequence identity (at either the amino acid or nucleotide level) within defined regions of the molecule or across 
15 the full-length sequence can be determined using methods known in the art and as described herein. 

Nucleic acid having protein coding sequence may be obtained by screening selected cDNA or genomic 
libraries using the deduced amino acid sequence disclosed herein for the first time, and, if necessary, using 
conventional primer extension procedures as described in Sambrook et al., supra , to detect precursors and 
processing intermediates of mRNA that may not have been reverse-transcribed into cDNA. 

20 

2. Selection and Transformation of Host Cells 
Host cells are transfected or transformed with expression or cloning vectors described herein for PRO 
production and cultured in conventional nutrient media modified as appropriate for inducing promoters, selecting 
transformants, or amplifying the genes encoding the desired sequences. The culture conditions, such as media, 
25 temperature, pH and the like, can be selected by the skilled artisan without undue experimentation. In general, 
principles, protocols, and practical techniques for maximizing the productivity of cell cultures can be found in 
Mammalian Cell Biotechnology: a Practical Approach , M. Butler, ed. (IRL Press, 1991) and Sambrook et al., 
supra . 

Methods of eukaryotic cell transfection and prokaryotic cell transformation are known to the ordinarily 
30 skilled artisan, for example, CaCl₂, CaPO₄, liposome-mediated and electroporation. Depending on the host cell 
used, transformation is performed using standard techniques appropriate to such cells. The calcium treatment 
employing calcium chloride, as described in Sambrook et al., supra , or electroporation is generally used for 
prokaryotes. Infection with Agrobacterium tumefaciens is used for transformation of certain plant cells, as 
described by Shaw et al., Gene . 23:315 (1983) and WO 89/05859 published 29 June 1989. For mammalian cells 
35 without such cell walls, the calcium phosphate precipitation method of Graham and van der Eb, Virology . 
52:456-457 (1978) can be employed. General aspects of mammalian cell host system transfections have been 
described in U.S. Patent No. 4,399,216. Transformations into yeast are typically carried out according to the 



WO 01/40466 



PCT7US00/32678 



method of Van Solingen et al. , J.Bact. . 130:946 (1977) and Hsiao et al. , Proc. Natl. Acad. Sci. (USA) . 76:3829 
(1979). However, other methods for introducing DNA into cells, such as by nuclear microinjection, 
electroporation, bacterial protoplast fusion with intact cells, or polycations, e.g., polybrene, polyornithine, may 
also be used. For various techniques for transforming mammalian cells, see Keown et al., Methods in 
Enzymology, 185:527-537 (1990) and Mansour et al., Nature, 336:348-352 (1988). 
5 Suitable host cells for cloning or expressing the DNA in the vectors herein include prokaryote, yeast, 

or higher eukaryote cells. Suitable prokaryotes include but are not limited to eubacteria, such as Gram-negative 
or Gram-positive organisms, for example, Enterobacteriaceae such as E. coli. Various E. coli strains are 
publicly available, such as E. coli K12 strain MM294 (ATCC 31,446); E. coli X1776 (ATCC 31,537); E. coli 
strain W3110 (ATCC 27,325) and K5 772 (ATCC 53,635). Other suitable prokaryotic host cells include 

10 Enterobacteriaceae such as Escherichia, e.g., E. coli, Enterobacter, Erwinia, Klebsiella, Proteus, Salmonella, 
e.g., Salmonella typhimurium, Serratia, e.g., Serratia marcescens, and Shigella, as well as Bacilli such as B. 
subtilis and B. licheniformis (e.g., B. licheniformis 41P disclosed in DD 266,710 published 12 April 1989), 
Pseudomonas such as P. aeruginosa, and Streptomyces . These examples are illustrative rather than limiting. 
Strain W3110 is one particularly preferred host or parent host because it is a common host strain for recombinant 

15 DNA product fermentations. Preferably, the host cell secretes minimal amounts of proteolytic enzymes. For 
example, strain W3110 may be modified to effect a genetic mutation in the genes encoding proteins endogenous 
to the host, with examples of such hosts including E. coli W3110 strain 1A2, which has the complete genotype 
tonA; E. coli W3110 strain 9E4, which has the complete genotype tonA ptr3; E. coli W3110 strain 27C7 
(ATCC 55,244), which has the complete genotype tonA ptr3 phoA E15 (argF-lac)169 degP ompT kanʳ; E. coli 

20 W3110 strain 37D6, which has the complete genotype tonA ptr3 phoA E15 (argF-lac)169 degP ompT rbs7 
ilvG kanʳ; E. coli W3110 strain 40B4, which is strain 37D6 with a non-kanamycin resistant degP deletion 
mutation; and an E. coli strain having mutant periplasmic protease disclosed in U.S. Patent No. 4,946,783 issued 
7 August 1990. Alternatively, in vitro methods of cloning, e.g., PCR or other nucleic acid polymerase 
reactions, are suitable. 

25 In addition to prokaryotes, eukaryotic microbes such as filamentous fungi or yeast are suitable cloning 

or expression hosts for PRO-encoding vectors. Saccharomyces cerevisiae is a commonly used lower eukaryotic 
host microorganism. Others include Schizosaccharomyces pombe (Beach and Nurse, Nature , 290: 140 [1981]; 
EP 139,383 published 2 May 1985); Kluyveromyces hosts (U.S. Patent No. 4,943,529; Fleer et al., 
Bio/Technology . 9:968-975 (1991)) such as, e.g., K. lactis (MW98-8C, CBS683, CBS4574; Louvencourt et 

30 al., J. Bacteriol. . 154(2):737-742 [1983]), K. fragilis (ATCC 12,424), K. bulgaricus (ATCC 16,045), K. 
wickeramii (ATCC 24,178), K. waltii (ATCC 56,500), K. drosophilarum (ATCC 36,906; Van den Berg et al., 
Bio/Technology . 8: 135 (1990)), K. thermotolerans, and K. marxianus; yarrowia (EP 402,226); Pichia pastoris 
(EP 183,070; Sreekrishna et al., J. Basic Microbiol. . 28:265-278 [1988]); Candida; Trichoderma reesia (EP 
244,234); Neurospora crassa (Case et al., Proc. Natl. Acad. Sci. USA . 76:5259-5263 [1979]); Schwanniomyces 

35 such as Schwanniomyces occidentalis (EP 394,538 published 31 October 1990); and filamentous fungi such as, 
e.g., Neurospora, Penicillium, Tolypocladium (WO 91/00357 published 10 January 1991), and Aspergillus hosts 
such as A. nidulans (Ballance et al., Biochem. Biophys. Res. Commun., 112:284-289 [1983]; Tilburn et al., 



